Findings

library(tidyverse)
## ── Attaching packages ───────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

## Load data
load('01_parsed.Rdata')

Finding: 25% of programs account for about 50% of (permanent) placements

ggplot(univ_df, aes(perm_placement_rank, frac_cum_perm_placements)) + 
    geom_step() +
    scale_x_continuous(labels = scales::percent_format(), 
                       name = 'PhD Programs') +
    scale_y_continuous(labels = scales::percent_format(), 
                       name = 'Permanent Placements')
## Warning: Removed 20 rows containing missing values (geom_path).

Build network

## NB Only permanent placements
hiring_network = individual_df %>%
    filter(permanent) %>%
    select(placing_univ_id, hiring_univ_id, everything()) %>%
    graph_from_data_frame(directed = TRUE, 
                          vertices = univ_df)

## 1 giant component contains almost all of the schools
components(hiring_network)$csize
##   [1]   1   1   1   1   1 786   1   1   2   1   1   1   1   1   1   1   1
##  [18]   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [35]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [52]   1   1   1   1   1   1   4   1   1   1   1   1   1   1   1   1   1
##  [69]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
##  [86]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [103]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [120]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [137]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [154]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [171]   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [188]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [205]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [222]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [239]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [256]   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
## [273]   1   1   1   1   1   1   1

Out- and in-centrality

set.seed(42)
V(hiring_network)$in_centrality = eigen_centrality(hiring_network, 
                                                   directed = TRUE,
                                                   weights = NA)$vector
graph.reverse <- function (graph) {
    if (!is.directed(graph))
        return(graph)
    e <- get.data.frame(graph, what="edges")
    ## swap "from" & "to"
    neworder <- 1:length(e)
    neworder[1:2] <- c(2,1)
    e <- e[neworder]
    names(e) <- names(e)[neworder]
    vertex_df = get.data.frame(graph, what = 'vertices')
    if (ncol(vertex_df) > 0) {
        return(graph.data.frame(e, vertices = vertex_df))
    } else {
        return(graph.data.frame(e))
    }
}
V(hiring_network)$out_centrality = hiring_network %>%
    graph.reverse() %>%
    eigen_centrality(directed = TRUE, weights = NA) %>%
    .$vector

## Add centrality statistics to the university df
univ_df = univ_df %>%
    mutate(out_centrality = V(hiring_network)[univ_df$univ_id]$out_centrality, 
           in_centrality = V(hiring_network)[univ_df$univ_id]$in_centrality)

## Check relationship btwn unweighted multiedges and weighted edges
## Strong correlation, esp by approx rank and high/low prestige
## But some movement, both numerically and ordinally
# E(hiring_network)$weight = count_multiple(hiring_network)
# hiring_network_simp = simplify(hiring_network)
# 
# set.seed(42)
# V(hiring_network_simp)$out_centrality = hiring_network_simp %>%
#     graph.reverse() %>%
#     eigen_centrality(directed = TRUE) %>%
#     .$vector
# 
# tibble(name = V(hiring_network)$univ_name,
#        multi = V(hiring_network)$out_centrality,
#        simp = V(hiring_network_simp)$out_centrality) %>%
#     ggplot(aes(log10(simp), log10(multi))) + 
#     geom_point() +
#     stat_function(fun = function (x) x)

Exploring centrality scores

There are two clear groups of centrality scores, with high scores (in the range of 10^-5 to 1) and low scores (10^-15 and smaller).

##
## NB there seem to be (small?) differences in scores (at the low end?) across runs of the centrality algorithm
ggplot(univ_df, aes(out_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_density).

ggplot(univ_df, aes(in_centrality)) + 
    geom_density() + geom_rug() +
    scale_x_continuous(trans = 'log10')
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing non-finite values (stat_density).

ggplot(univ_df, aes(out_centrality, in_centrality, 
                    color = cluster, 
                    text = univ_name)) +
    geom_jitter() + 
    scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis
# ## Check stability of centrality calculation
# iterated_centrality = 1:500 %>%
#     map(~ {hiring_network %>%
#             graph.reverse() %>%
#             eigen_centrality(directed = TRUE, weights = NA) %>%
#             .$vector}) %>%
#     transpose() %>%
#     map(unlist) %>%
#     map(~ tibble(out_centrality = .)) %>%
#     bind_rows(.id = 'univ_id') %>%
#     group_by(univ_id) %>%
#     summarize(min = min(out_centrality),
#               max = max(out_centrality),
#               median = median(out_centrality))
# 
# ## Lots of variation among low-prestige
# ggplot(iterated_centrality, 
#        aes(reorder(univ_id, median), 
#            median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()
# 
# ## But extremely stable results among high-prestige
# iterated_centrality %>%
#     filter(min > 10^-11) %>%
#     ggplot(aes(reorder(univ_id, median), 
#                median)) + 
#     geom_pointrange(aes(ymin = min, ymax = max)) + 
#     scale_y_log10()

We completely rewire the network, preserving out-degree (number of new PhDs) and in-degree (number of permanent hires), then calculate out-centrality again. The plots below show the out-centrality distributions for each random rewiring, with the actual distribution in red. The distributions are always bimodal, indicating that the observed bimodiality is due in part to the way PhD production and hiring are distributed across institutions. However, the high-centrality subset is never as small as in the actual hiring network. This indicates that bimodiality is not due only to the degree distributions. Some other factor is exacerbating social inequality in the hiring network.

set.seed(78910)
map(1:100, 
    ~ sample_degseq(degree(hiring_network, mode = 'out'), 
                    degree(hiring_network, mode = 'in'))
) %>%
    map(graph.reverse) %>%
    map(eigen_centrality, directed = TRUE, weights = NULL) %>%
    map(~ .$vector) %>%
    map(log10) %>%
    map(density) %>%
    map(~ tibble(x = .$x, y = .$y)) %>%
    bind_rows(.id = 'iteration') %>%
    ggplot(aes(x, y)) + 
    geom_line(aes(group = iteration), alpha = .5) + 
    stat_density(data = univ_df, geom = 'line',
                 aes(log10(out_centrality), ..density..), 
                 color = 'red', size = 1) +
    xlim(NA, 0)
## Warning: Removed 11 rows containing non-finite values (stat_density).
## Warning: Removed 6472 rows containing missing values (geom_path).

Finding: High output is only modestly correlated w/ centrality. Programs like ND, CUNY, New School produce lots of PhDs, but they aren’t placed into the high-centrality departments.

ggplot(univ_df, aes(total_placements, log10(out_centrality)
                    )) +
    geom_point(aes(text = univ_name))
## Warning: Ignoring unknown aesthetics: text
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
univ_df %>%
    mutate(out_centrality_log = log10(out_centrality)) %>%
    filter(!is.na(out_centrality_log) & 
               out_centrality > 0 &
               !is.na(total_placements)) %>%
    select(total_placements, out_centrality_log) %>%
    cor()
##                    total_placements out_centrality_log
## total_placements          1.0000000          0.3614052
## out_centrality_log        0.3614052          1.0000000
ggplot(univ_df, aes(perm_placement_rank, log10(out_centrality))) +
    geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).

ggplot(univ_df, aes(perm_placement_rate, log10(out_centrality), 
                    text = univ_name)) + 
    geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Movement w/in high-prestige group
# individual_df %>% 
#     filter(permanent) %>%
#     left_join(select(univ_df, univ_id, out_centrality), 
#               by = c('placing_univ_id' = 'univ_id')) %>%
#     left_join(select(univ_df, univ_id, out_centrality),
#               by = c('hiring_univ_id' = 'univ_id')) %>%
#     filter(out_centrality.y > 10^-10) %>%
#     select(person_id, 
#            placing = out_centrality.x, 
#            hiring = out_centrality.y) %>%
#     gather(variable, value, -person_id) %>%
#     mutate(variable = fct_rev(variable)) %>%
#     ggplot(aes(variable, log10(value))) + 
#     geom_point() +
#     geom_line(aes(group = person_id))

## Diagonal line indicates where hiring program has the same centrality as the placing program.  
## Most placements are below this line, indicating that the centrality measure captures the idea that people typically are hired by programs with lower status
## The few placements above the line indicate that, even when individuals are hired by programs with higher status, the increase is relatively minor:  no more than 1 point on the log10 scale
individual_df %>% 
    filter(permanent) %>%
    left_join(select(univ_df, univ_id, out_centrality), 
              by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(select(univ_df, univ_id, out_centrality),
              by = c('hiring_univ_id' = 'univ_id')) %>%
    filter(out_centrality.y > 10^-10) %>%
    select(person_id, 
           placing = out_centrality.x, 
           hiring = out_centrality.y) %>%
    ggplot(aes(log10(placing), log10(hiring))) + 
    geom_point() + 
    stat_function(fun = function (x) x)

Community detection

## steps = 26 has low values for both entropy of cluster sizes (high delta H) and total # clusters
## but somewhere along the way this started producing >300 clusters? 
## bc ~300 connected components
## Extract giant component
hiring_network_gc = hiring_network %>%
    components %>% 
    .$csize %>%
    {which(. == max(.))} %>%
    {components(hiring_network)$membership == .} %>%
    which() %>%
    induced_subgraph(hiring_network, .)

# walk_len = rep(2:100, 1)
# # ## NB Takes a minute or so
# cluster_stats = walk_len %>%
#   map(~ cluster_walktrap(hiring_network_gc, steps = .x)) %>%
#   map(~ list(sizes = sizes(.x), length = length(.x))) %>%
#   map_dfr(~ tibble(H = -sum(.x$sizes/sum(.x$sizes) * log2(.x$sizes/sum(.x$sizes))),
#                    n_clusters = .x$length)) %>%
#   mutate(walk_len = as.factor(walk_len),
#          delta_H = log2(n_clusters) - H)
# 
# ggplot(cluster_stats, aes(walk_len, delta_H)) +
#     geom_point() +
#     coord_flip()
# ggplot(cluster_stats, aes(walk_len, n_clusters)) +
#     geom_point() +
#     coord_flip()
# ggplot(cluster_stats, aes(n_clusters, delta_H)) +
#   geom_text(aes(label = walk_len)) +
#   scale_x_continuous()


communities = cluster_walktrap(hiring_network_gc, steps = 26)
V(hiring_network_gc)$community = membership(communities)
univ_df = univ_df %>%
    left_join({hiring_network_gc %>%
            as_data_frame(what = 'vertices') %>% 
            select(univ_id = name, community) %>%
            mutate(community = as.character(community))})
## Joining, by = "univ_id"

Finding: There is no correlation between semantic clusters and topological communities.

univ_df %>%
    filter(!is.na(community), !is.na(cluster)) %>%
    select(community, cluster) %>%
    table() %>%
    chisq.test(simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  .
## X-squared = 120.65, df = NA, p-value = 0.4788
univ_df %>%
    filter(!is.na(community), !is.na(cluster)) %>%
    count(community, cluster) %>%
    rename(cluster_n = n) %>%
    group_by(community) %>%
    mutate(community_tot = sum(cluster_n), 
           cluster_frac = cluster_n / community_tot, 
           H = sum(cluster_frac * log2(cluster_frac))) %>%
    ggplot(aes(fct_reorder(community, H, .desc = TRUE),
               cluster_n, fill = cluster)) + 
    geom_col() + 
    coord_flip() +
    xlab('Topological communities') +
    ylab('# of schools in community') +
    scale_fill_brewer(palette = 'Set1', 
                       name = 'Semantic\nclusters')

High-prestige universities

## Start w/ Oxford, and work upstream
## Only need to go 8 steps to get closure
1:25 %>%
    map(~ make_ego_graph(hiring_network, order = .x, 
                         nodes = '512', mode = 'in')) %>%
    flatten() %>%
    map(~ length(V(.x))) %>%
    tibble(order = 1:length(.), 
           size = unlist(.)) %>%
    ggplot(aes(order, size)) + geom_point()

prestigious = make_ego_graph(hiring_network, order = 8, 
                        nodes = '512', 
                        mode = 'in') %>%
    .[[1]]

## How large is the high-prestige community?  
## 61 programs; 6% of all programs in the network; 
## 39% of programs with at least 1 placement in the dataset
length(V(prestigious))
## [1] 61
length(V(prestigious)) / length(V(hiring_network))
## [1] 0.05700935
length(V(prestigious)) / sum(!is.na(univ_df$total_placements))
## [1] 0.388535
## What fraction of hires are within high-prestige?  
## 15% of all permanent hires; 7% of all hires
length(E(prestigious)) / length(E(hiring_network))
## [1] 0.1542219
length(E(prestigious)) / nrow(individual_df)
## [1] 0.07828551
# png(file = '02_prestigious_net.png', 
#     width = 11, height = 11, units = 'in', res = 400)
ggraph(prestigious) + 
    geom_node_label(aes(label = univ_name, 
                        size = log10(out_centrality))) + 
    geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')), 
                  alpha = .25,
                  spread = 5) +
    scale_size_continuous(range = c(.5, 3)) +
    theme_graph()
## Using `nicely` as default layout

# dev.off()

## High-prestige = high centrality group
univ_df = univ_df %>%
    mutate(prestige = ifelse(univ_id %in% V(prestigious)$name, 
                                'high-prestige', 
                                'low-prestige'))

ggplot(univ_df, aes(prestige, log10(out_centrality))) + 
    geom_jitter()

## Prestige status and clusters
## High-prestige basically don't appear in clusters 2, 3, 7
univ_df %>%
    filter(!is.na(cluster)) %>%
    ggplot(aes(cluster, color = prestige)) + 
    geom_point(stat = 'count') +
    geom_line(aes(group = prestige), stat = 'count')

## High-prestige are mostly confined to the 3 large communities
univ_df %>%
    filter(!is.na(community)) %>%
    ggplot(aes(as.integer(community), fill = prestige)) + 
    geom_bar()

## What fraction of high-prestige graduates end up in high-prestige programs? 
## 221 / (221 + 569) = 28% of those w/ permanent placements
individual_df %>%
    filter(permanent) %>%
    left_join(univ_df, by = c('placing_univ_id' = 'univ_id')) %>%
    left_join(univ_df, by = c('hiring_univ_id' = 'univ_id')) %>%
    select(prestige.x, prestige.y) %>%
    table()
##                prestige.y
## prestige.x      high-prestige low-prestige
##   high-prestige           221          569
##   low-prestige              0          643

Finding: Median permanent placement rate for high-prestige programs is 14 points higher than for low-prestige programs. However, variation is also wide within each group; even among high-prestige programs, median permanent placement rate is only 58%. This is also not yet controlling for graduation year, area, or demographics.

ggplot(univ_df, aes(prestige, perm_placement_rate, 
                    label = univ_name)) + 
    geom_boxplot(color = 'red') +
    geom_jitter() +
    scale_y_continuous(labels = scales::percent_format())
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).
## Warning: Removed 913 rows containing missing values (geom_point).

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: Removed 913 rows containing non-finite values (stat_boxplot).

Stability of prestige status

By randomly rewiring the network, we can test the stability of the prestige categories to data errors and the short time frame of our data. In each of 500 permutations, we randomly rewire 10% of the edges in the permanent hiring network, then calculate out-centralities on the rewired network. Using a threshold of 10^-7 for high-prestige status, we count the fraction of rewired networks in which each program is high-prestige.

system.time({
    set.seed(13579)
    permutations = 1:500 %>%
        ## Rewire 10% of edges
        map(~ {hiring_network %>%
                rewire(keeping_degseq(loops = TRUE, 
                                      niter = floor(.05 * length(E(.)))))}) %>%
        ## Calculate out-centralities
        map(~ {.x %>%
                graph.reverse() %>%
                eigen_centrality(directed = TRUE, weights = NA) %>%
                .$vector}) %>%
        transpose() %>%
        map(unlist) %>%
        ## Fraction where program is high-prestige
        map(~ sum(. > 10^-7) / length(.)) %>%
        map(~ tibble(frac_high_prestige = .)) %>%
        bind_rows(.id = 'univ_id')
})
##    user  system elapsed 
##   6.932   0.424   8.665
## frac_high_prestige indicates the fraction of permutations in which the program was high-prestige
ggplot(permutations, aes(frac_high_prestige)) + 
    geom_density() + 
    geom_rug()

univ_df = left_join(univ_df, permutations)
## Joining, by = "univ_id"

Finding: High-prestige status is highly stable; actual high-prestige programs are almost always (>90%) high-prestige in the rewired networks.

Finding: A number of actual non-high-prestige programs are often (>50%) high-prestige in the rewired networks.
- Notre Dame (86%) - New School (71%) - Penn State (65%) - Tulane (62%) - Boston College (61%) - UC Boulder (56%)

Finding: Within the actual categories, there is no correlation between counterfactual high-prestige status and either (a) out-centrality and (b) the number of permanent placements.

ggplot(univ_df, aes(log10(out_centrality), frac_high_prestige, 
                    text = univ_name)) + 
    geom_point()

plotly::ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
ggplot(univ_df, aes(perm_placements, frac_high_prestige)) + 
    geom_point()
## Warning: Removed 913 rows containing missing values (geom_point).

Among low-status programs, a likely explanation is the size of the program’s total downstream network. Notre Dame’s is fairly large, with many potential PhDs; if rewiring leads any of these PhDs to a high-status position, both their PhD institution and Notre Dame join the high-status group. UC Boulder’s downstream network is smaller, with fewer opportunities to place into a high-status position.

make_ego_graph(hiring_network, order = 10, 
               nodes = c('314', '260'),
               mode = 'out') %>% 
    disjoint_union() %>%
    induced_subgraph(., 
                     which(!is.na(V(.)$perm_placements))) %>%
    ggraph() + 
    geom_node_label(aes(label = univ_name, 
                        size = perm_placements)) + 
    geom_edge_fan(arrow = arrow(angle = 45, 
                                length = unit(.1, 'inches'), 
                                type = 'closed')) +
    scale_label_size(range = c(0, 1.5), name = 'placements') + 
    theme_graph()
## Warning in disjoint_union(.): Duplicate vertex names in disjoint union
## Using `nicely` as default layout

Plotting

Coarser community structure

large_clusters = which(sizes(communities) > 100)
V(hiring_network_gc)$community_coarse = ifelse(
    V(hiring_network_gc)$community %in% large_clusters, 
    V(hiring_network_gc)$community, 
    'other')

FR

hiring_network_gc %>%
    induced_subgraph(which(degree(., mode = 'out') > 10)) %>%
    ggraph() +
    # geom_node_label(aes(label = univ_name, 
    #                     size = log10(1 + out_centrality), 
    #                     color = as.factor(community))) +
    geom_edge_fan(arrow = arrow(length = unit(.01, 'npc')), 
                  spread = 5) +
    geom_node_point(aes(size = log10(1 + out_centrality), 
                        # color = as.factor(community))) +
                        color = community_coarse)) +
    scale_color_brewer(palette = 'Set1', guide = FALSE) +
    scale_size(range = c(2, 6)) +
    theme_graph()
## Using `nicely` as default layout

Chord diagram

hiring_network_gc %>%
    induced_subgraph(which(degree(., mode = 'out') > 1)) %>%
    ggraph(layout = 'linear', sort.by = 'community_coarse', circular = TRUE) +
    geom_edge_arc(arrow = arrow(length = unit(.01, 'npc')), alpha = .1) +
    geom_node_point(aes(size = log10(1 + out_centrality), 
                        # color = as.factor(community))) +
                        color = community_coarse)) +
    scale_color_brewer(palette = 'Set1', guide = FALSE) +
    theme_graph()

## Save university-level data with network statistics
save(univ_df, file = '02_univ_net_stats.Rdata')